Quick and dirty first-pass DEG expression analysis for expt. SCL000091
Data Setup
Please copy over the SCL000091.Gene_Expression_Count.txt and the SCL000091_Sample_Metadata.csv files to your home directory that is accessible to the GRP workbench. The /B32NGS/Projects mount where the original files reside isn’t accessible from this server. I’ve copied over my files to this location: /fcrbiouatappn01/home/ssunkara1/SCL000091
My version of the metadata.csv has an additional column ‘grp’ to group samples into the 4 categories: WT, WT-SN38, S, S-SN38
Install DESeq2, Org.Hs.eg.db, clusterProfiler, a few plotting packages, zeallot, fgsea and devtools packages
load packages
read datasets into variables; please substitute paths to your files accordingly
count_data <- as.matrix(read.csv("/fcrbiouatappn01/home/ssunkara1/SCL000091/SCL000091.Gene_Expression_Count.txt",sep="\t",row.names="Gene_ID"))
metadata <- read.csv("/fcrbiouatappn01/home/ssunkara1/SCL000091/SCL000091_Sample_Metadata.csv",sep=",")
typecast grp variable as a factor and populate coldat
coldat <- data.frame(grp=factor(metadata$grp))
Create a DESeqDataSet object from the count matrix using the DESeqDataSetFromMatrix function Arbitrarily filter genes where the cumulative read count over all samples is <=10 (cumreadcount). Cumreadcount cut-off is empirical, and should be further tested. Run the DESeq function on the DESeqDataSet object which fits a GLM model based on the Negative Binomial distribution. DESeq also returns a dds (DESeqDataSetFromMatrix) object.
cumreadcount <- 10
ddsMat <- DESeqDataSetFromMatrix(countData = count_data, colData=coldat, design = ~ grp)
keep <- rowSums(counts(ddsMat)) > cumreadcount
dds <- ddsMat[keep,]
dds <- DESeq(dds)
PCA analysis to look for batch effects in the samples Variance stabilizing transformation (vst) is used for sample clustering. blind = TRUE option looks for across sample variability.
Extract the GLM results from the dds object & run comparisons Comparisons of interest between WT and WT-SN38, and then between S and S-SN38 from the fitted model are done using the contrasts argument to the results function, on the dds object.
res_WT_WTSN38 <- results(dds, contrast=c("grp", "WT", "WT-SN38"), tidy = TRUE)
res_S_SSN38 <- results(dds, contrast=c("grp", "S", "S-SN38"), tidy = TRUE)
data munging to extract differentially expressed genes from the above 2 comparisons The log fold change(lfc) and pval thresholds are arbitrarily chosen here. Some work needs to be done here to choose meaningful cut-offs based on empirical evidence in this expt.
tab_WT_WTSN38 <- data.frame(logFC = res_WT_WTSN38$log2FoldChange, negLogPval = -log10(res_WT_WTSN38$padj))
tab_S_SSN38 <- data.frame(logFC = res_S_SSN38$log2FoldChange, negLogPval = -log10(res_S_SSN38$padj))
pval <- 0.01
lfc <- 1.0
signGenes_WT_WTSN38 <- (abs(tab_WT_WTSN38$logFC) > lfc & tab_WT_WTSN38$negLogPval > -log10(pval))
signGenes_S_SSN38 <- (abs(tab_S_SSN38$logFC) > lfc & tab_S_SSN38$negLogPval > -log10(pval))
DEG_WT_WTSN38 <- res_WT_WTSN38[signGenes_WT_WTSN38,]
DEG_S_SSN38 <- res_S_SSN38[signGenes_S_SSN38,]
volcano plots between the WT vs. WT-SN38 and S vs. S-SN38 comparisons for DE Consider changing the lfc and pval cut-offs in the code-block above, to play around with cut-offs.
# plot both comparisons
volcanoplot <- function(tab, signGenes) {
plot(tab, pch = 16, cex = 0.6, xlab = expression(log[2]~fold~change), ylab = expression(-log[10]~pvalue))
# highlight over-expressed genes in red
points(tab[signGenes, ], pch = 16, cex = 0.6, col = "red")
# Indicate the pval cut-off as a green line
abline(h = -log10(pval), col = "green3", lty = 2)
# Indicate the fold change cut-offs as a blue line
abline(v = c(-lfc, lfc), col = "blue", lty = 2)
mtext(paste("pval =", pval), side = 4, at = -log10(pval), cex = 0.4, line = 0.3, las = 1)
mtext(c(paste("-", lfc, "fold"), paste("+", lfc, "fold")), side = 3, at = c(-lfc, lfc), cex = 0.5, line = 0.6)
}
par(mfrow=c(1,2))
volcanoplot(tab_WT_WTSN38, signGenes_WT_WTSN38)
volcanoplot(tab_S_SSN38, signGenes_S_SSN38)
catenate gene-names to the DEG_S_SSN38 and DEG_WT_WTSN38 dataframes & sort in decreasing order of fold change
GeneNamecount_data <- as.matrix(read.csv("/fcrbiouatappn01/home/ssunkara1/SCL000091/SCL000091_genes.count.txt",sep="\t"))
genenameDF <- GeneNamecount_data[,c("Gene_ID", "Gene_Name")]
DEG_WT_WTSN38 = merge(x=DEG_WT_WTSN38,y=genenameDF,by.x="row", by.y="Gene_ID")
DEG_WT_WTSN38 <- DEG_WT_WTSN38[order(-DEG_WT_WTSN38$log2FoldChange),]
DEG_S_SSN38 = merge(x=DEG_S_SSN38,y=genenameDF,by.x="row", by.y="Gene_ID")
DEG_S_SSN38 <- DEG_S_SSN38[order(-DEG_S_SSN38$log2FoldChange),]
Extract Genes that are common to both comparisons from above WT vs. WT-SN38 has more differentially expressed genes than the S vs. S-SN38
nrow(DEG_WT_WTSN38)
## [1] 3575
nrow(DEG_S_SSN38)
## [1] 2111
DEG_commongenes <- DEG_WT_WTSN38[DEG_WT_WTSN38$row %in% DEG_S_SSN38$row, ]
nrow(DEG_commongenes)
## [1] 1118
R function to generate genelist with log fold change.
genelistfunc <- function(DEGdf) {
genes_lfc <- DEGdf$log2FoldChange
DEGdf$row <- gsub("\\..*", '', DEGdf$row)
names(genes_lfc) <- DEGdf$row
list(genes_lfc, DEGdf)
}
Data frames and gene-lists for WT, SN38 and common-genes
c(commongenes_lfc, DEG_commongenes) %<-% genelistfunc(DEG_commongenes)
c(WT_WTSN38_lfc, DEG_WT_WTSN38) %<-% genelistfunc(DEG_WT_WTSN38)
c(S_SSN38_lfc, DEG_S_SSN38) %<-% genelistfunc(DEG_S_SSN38)
GO gene enrichment analysis and over-representation analysis Analysis is done using the ClusterProfiler package. Feel free to reevaluate enriched genes below, based on other ontology types like “BP” and “CC”.
GOenrichfunc <- function(genes_lfc, DEGdf) {
# Enrichment analysis
gse <- gseGO(geneList=genes_lfc, ont ="MF", keyType = "ENSEMBL", minGSSize = 3, maxGSSize = 800, pvalueCutoff = 0.05, verbose = TRUE, OrgDb = "org.Hs.eg.db", pAdjustMethod = "none", nPermSimple = 10000)
# Over representation analysis
overrep <- enrichGO(gene = DEGdf$row, keyType = "ENSEMBL", OrgDb = 'org.Hs.eg.db', ont = "MF", pAdjustMethod = "none", pvalueCutoff = 0.05, qvalueCutoff = 0.05, readable = TRUE)
list(gse, overrep)
}
c(gse_commongenes, overrep_commongenes) %<-% GOenrichfunc(commongenes_lfc, DEG_commongenes)
c(gse_WT, overrep_WT) %<-% GOenrichfunc(WT_WTSN38_lfc, DEG_WT_WTSN38)
c(gse_S, overrep_S) %<-% GOenrichfunc(S_SSN38_lfc, DEG_S_SSN38)
# plot dotplots for enrichement analysis. Feel free to test other comparisons, for example: WT or SN38
dotplot(gse_commongenes, showCategory=10, font.size=6, split=".sign") + facet_grid(.~.sign)
R function for pathway analysis. Pathway analysis using the gseKEGG function from clusterProfiler. clusterProfiler supports direct access to the latest KEGG database. KEGG analysis needs the gene_ids in Entrez format. Remapping the gene ids from Ensembl to Entrez format below. (Note that the ENSEMBL_id <-> Entrez_id conversion is not 100%). In this case, <1% of the common_genes with an Ensembl_id do not have an Entrez id.
pathwayfunc <- function(genelist_lfc, DEGdf) {
# map ENSEMBL to ENTREZIDs.
ids <- bitr(names(genelist_lfc), fromType = "ENSEMBL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db")
# dedup the ENSEMBL ids
dedup_ids <- ids[!duplicated(ids[c("ENSEMBL")]),]
# DEG dataframe with the deduped Ensembl ids
DEGdf <- DEGdf[DEGdf$row %in% dedup_ids$ENSEMBL,]
# add Entrez_ids to the DEGdf dataframe
DEGdf$ENTREZID = dedup_ids$ENTREZID
# generate kegg gene list with ENTREZ IDs and fold changes from the deduped DEGdf
kegg_gene_list <- DEGdf$log2FoldChange
names(kegg_gene_list) <- DEGdf$ENTREZID
# na shouldn't be an issue, but just in case. Follow by sort
kegg_gene_list<-na.omit(kegg_gene_list)
kegg_gene_list <- sort(kegg_gene_list, decreasing = TRUE)
# Look for over-represented pathways based on gene-names.
genes <- names(kegg_gene_list)
represented_pathway <- enrichKEGG(gene = genes, organism = 'hsa', pvalueCutoff = 0.05)
# Enrichment analysis using the gseKEGG function (leverages log2FC, and not just gene names).
enriched_pathway <- gseKEGG(geneList = kegg_gene_list, organism = 'hsa', nPermSimple = 10000, minGSSize =3, maxGSSize = 800, pvalueCutoff = 0.05, pAdjustMethod = "none", keyType = "ncbi-geneid")
# return the over-represented pathway, enriched pathway, modified dataframe as well as the sorted gene list with entrez ids.
list(represented_pathway, enriched_pathway, DEGdf, kegg_gene_list)
}
# run the above function for common-genes, WT DE genes, and SN38 DE genes
c(commongenes_Opathway, commongenes_pathway, DEG_commongenes, ENTREZ_commongenes) %<-% pathwayfunc(commongenes_lfc, DEG_commongenes)
c(WT_WTSN38_Opathway, WT_pathway, DEG_WT_WTSN38, ENTREZ_WT_WTSN38) %<-% pathwayfunc(WT_WTSN38_lfc, DEG_WT_WTSN38)
c(S_SSN38_Opathway, S_pathway, DEG_S_SSN38, ENTREZ_S_SSN38) %<-% pathwayfunc(S_SSN38_lfc, DEG_S_SSN38)
# You may want to look for over-represented pathways from the other comparisons, simply by changing out the *_Opathway variable below:
head(commongenes_Opathway, n=30) %>% as_tibble()
## # A tibble: 15 × 9
## ID Description GeneR…¹ BgRatio pvalue p.adj…² qvalue geneID Count
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <int>
## 1 hsa04010 MAPK signaling… 35/467 294/81… 2.59e-5 0.00514 0.00412 26281… 35
## 2 hsa04310 Wnt signaling … 24/467 170/81… 3.31e-5 0.00514 0.00412 8313/… 24
## 3 hsa04330 Notch signalin… 12/467 59/8170 1.01e-4 0.0104 0.00834 55534… 12
## 4 hsa04020 Calcium signal… 28/467 240/81… 2.36e-4 0.0171 0.0137 26281… 28
## 5 hsa05226 Gastric cancer 20/467 149/81… 2.97e-4 0.0171 0.0137 26281… 20
## 6 hsa04060 Cytokine-cytok… 32/467 295/81… 3.31e-4 0.0171 0.0137 1235/… 32
## 7 hsa04925 Aldosterone sy… 15/467 98/8170 4.10e-4 0.0182 0.0146 57118… 15
## 8 hsa04929 GnRH secretion 11/467 64/8170 8.92e-4 0.0337 0.0270 776/5… 11
## 9 hsa04971 Gastric acid s… 12/467 76/8170 1.15e-3 0.0337 0.0270 3784/… 12
## 10 hsa04973 Carbohydrate d… 9/467 47/8170 1.17e-3 0.0337 0.0270 776/5… 9
## 11 hsa04921 Oxytocin signa… 19/467 154/81… 1.19e-3 0.0337 0.0270 5530/… 19
## 12 hsa04360 Axon guidance 21/467 182/81… 1.60e-3 0.0385 0.0308 10371… 21
## 13 hsa05410 Hypertrophic c… 13/467 90/8170 1.70e-3 0.0385 0.0308 1674/… 13
## 14 hsa05224 Breast cancer 18/467 147/81… 1.74e-3 0.0385 0.0308 26281… 18
## 15 hsa04625 C-type lectin … 14/467 104/81… 2.27e-3 0.0469 0.0376 5530/… 14
## # … with abbreviated variable names ¹GeneRatio, ²p.adjust
# dotplot of the enriched kegg pathways
# dotplots for enriched kegg pathways from other comparisons could similarly be visualized by changing out the *_pathway variable.
dotplot(commongenes_pathway, showCategory = 10, font.size = 6, title = "Enriched Pathways" , split=".sign") + facet_grid(.~.sign)
Specific pathway visualization You maybe interested in reviewing any of the KEGG pathways closely for useful interrogation. The pathview functionality offers you the ability to view upregulated and downregulated genes within the context of the specific pathway.
# reviewing the top hit (MAPK signaling pathway) in the KEGG over representation analysis.
# This will bring up a separate window.
pathview(gene.data = ENTREZ_commongenes, pathway.id = "hsa04010", species = "hsa")
KEGG, REACTOME, and WIKI Pathway over representation analysis of the common gene set with the gprofiler2 package The KEGG pathway results should come out to be the same with the clusterprofiler package results. Differences between KEGG and REACTOME pathway results are because of underlying differences in the databases.
gostres <- gost(DEG_commongenes$row, organism = "hsapiens", ordered_query = TRUE,multi_query = TRUE, significant = TRUE, exclude_iea = FALSE,measure_underrepresentation = FALSE,evcodes = TRUE, source = c("KEGG", "REAC", "WP"), user_threshold = 0.05, as_short_link = FALSE, domain_scope="known", correction_method = "g_SCS")
# View the top 30 hits
head(gostres$result, n=30) %>% as_tibble()
## # A tibble: 30 × 11
## term_id p_val…¹ signi…² term_…³ query…⁴ inter…⁵ source term_…⁶ effec…⁷
## <chr> <list> <list> <int> <list> <list> <chr> <chr> <int>
## 1 REAC:0000000 <dbl> <lgl> 10770 <int> <int> REAC REACTO… 61498
## 2 KEGG:00000 <dbl> <lgl> 7559 <int> <int> KEGG KEGG r… 61498
## 3 WP:000000 <dbl> <lgl> 7364 <int> <int> WP WIKIPA… 61498
## 4 REAC:R-HSA-16… <dbl> <lgl> 2523 <int> <int> REAC Signal… 61498
## 5 KEGG:01100 <dbl> <lgl> 1457 <int> <int> KEGG Metabo… 61498
## 6 REAC:R-HSA-14… <dbl> <lgl> 2075 <int> <int> REAC Metabo… 61498
## 7 KEGG:05200 <dbl> <lgl> 517 <int> <int> KEGG Pathwa… 61498
## 8 REAC:R-HSA-16… <dbl> <lgl> 1694 <int> <int> REAC Disease 61498
## 9 REAC:R-HSA-16… <dbl> <lgl> 2041 <int> <int> REAC Immune… 61498
## 10 REAC:R-HSA-59… <dbl> <lgl> 1414 <int> <int> REAC Post-t… 61498
## # … with 20 more rows, 2 more variables: source_order <int>, parents <list>,
## # and abbreviated variable names ¹p_values, ²significant, ³term_size,
## # ⁴query_sizes, ⁵intersection_sizes, ⁶term_name, ⁷effective_domain_size
Interactive plot from the above dataset
gostplot(gostres, interactive = TRUE, capped = TRUE)
More pathway over-representation analysis gprofiler2 and msigdb share KEGG, REACTOME and WIKI PATHWAYS in common. msigdb additionally has Biocarta and PID pathway databases. To lookup pathway representations in either of those databases, these are the commands. For the common genes, no pathways were detected in either biocarta or the PID datasets
## Extract gene sets from the category C2, subcategory BIOCARTA from msigdb
## msigdbr_collections() to look up the categories and subcategories
biocarta_gene_sets = msigdbr(species = "human", category = "C2", subcategory = "CP:BIOCARTA")
## Generate a data frame with entrez ids
biocarta_t2g <- biocarta_gene_sets %>% dplyr::distinct(gs_name, entrez_gene) %>% as.data.frame()
## Look for enriched pathways in the common gene list from the Biocarta pathway database
## List should come back empty.
enrichedBiocarta <- enricher(gene = names(ENTREZ_commongenes), TERM2GENE = biocarta_t2g)
enrichedBiocarta$ID %>% as_tibble()
## # A tibble: 0 × 1
## # … with 1 variable: value <chr>
For Transcription factor motif over-representation as well as network analysis, the ChEA3 package is chosen
library(httr)
library(jsonlite)
url <- "https://maayanlab.cloud/chea3/api/enrich/"
encode <- "json"
payload <- list(query_name = "TFBSQuery", gene_set = DEG_commongenes$Gene_Name)
#POST to ChEA3 server
response <- POST(url = url, body = payload, encode = encode)
json <- content(response, "text")
#results as list of R dataframes
results <- fromJSON(json)
# recommendation is to look at the Integrated meanRank results, because that metric performed the best in their benchmark evaluation
results$`Integrated--meanRank` %>% as_tibble()
## # A tibble: 1,632 × 6
## `Query Name` Rank TF Score Library Overl…¹
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 TFBSQuery 1 BHLHE22 24.0 ARCHS4 Coexpression,29;ReMap ChIP-s… EXOC3L…
## 2 TFBSQuery 2 CXXC5 32.0 ARCHS4 Coexpression,15;GTEx Coexpre… KCNC3,…
## 3 TFBSQuery 3 ZNF385B 42.33 ARCHS4 Coexpression,48;Enrichr Quer… STEAP4…
## 4 TFBSQuery 4 ZBTB18 49.5 ARCHS4 Coexpression,7;GTEx Coexpres… ATP8A1…
## 5 TFBSQuery 5 ETV1 52.0 ARCHS4 Coexpression,31;Enrichr Quer… IPO13,…
## 6 TFBSQuery 6 MKX 59.67 ARCHS4 Coexpression,71;Enrichr Quer… NRP1,C…
## 7 TFBSQuery 7 NR4A3 65.67 ARCHS4 Coexpression,45;Enrichr Quer… CSRNP1…
## 8 TFBSQuery 8 MYT1 70.33 ARCHS4 Coexpression,115;Enrichr Que… CDKN1A…
## 9 TFBSQuery 9 NKX63 70.5 ARCHS4 Coexpression,90;GTEx Coexpre… RAB3B,…
## 10 TFBSQuery 10 ZNF488 72.5 ARCHS4 Coexpression,17;Enrichr Quer… RAB3B,…
## # … with 1,622 more rows, and abbreviated variable name ¹Overlapping_Genes